HW 03

Author

Trevor Macdonald

Code
# Load and install only the libraries actually used in HW-03
if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  # Core tidyverse + file/path handling
  tidyverse,    # dplyr, ggplot2, readr, etc.
  here,         # relative file paths
  janitor,      # clean_names()
  glue,         # string interpolation
  lubridate,    # dates
  scales,       # percent_format(), comma()

  # Plotting & annotations
  ggrepel,      # better geom_text labels
  ggthemes,     # extra themes
  cowplot,      # for draw_plot() and draw_grob()
  ggpmisc,      # for stat_poly_eq etc. (optional but fine to keep)
  colorspace,   # accessibility palettes

  # Image & raster
  jpeg,         # to read JPEG parchment
  grid,         # for rasterGrob()

  # Likert & challenge-specific
  dsbox,        # gglikert()
  ggstats       # used in setup/testing
)

# Global plot style
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# Console display width
options(width = 65)

# Global chunk settings
knitr::opts_chunk$set(
  fig.width = 7,
  fig.asp = 0.618,
  fig.retina = 3,
  fig.align = "center",
  dpi = 300
)

1 - Du Bois challenge.

Code
# source of parchment background: https://pixabay.com/photos/paper-old-texture-parchment-1074131/

# Parchment background
paper <- readJPEG(here("data", "paper.jpg"))

# Convert image to a raster object
paper_raster <- rasterGrob(paper, width = unit(1, "npc"), height = unit(1, "npc"))

# Read income data
income <- read_csv(here("data", "income.csv"))

#Fix Label downstream
income <- income |> 
  mutate(Class = recode(Class,
    "$1000 AND OVER" = "$1,000 AND OVER",
    "$750-1000" = "$750-1,000"
  ))

# Clean
income_long <- income |>
  mutate(
    # Fix specific values for bottom class
    Tax   = if_else(Class == "$100-200", 0.1, Tax),
    Other = if_else(Class == "$100-200", 9.9, Other)
  )|>
  pivot_longer(
    cols = Rent:Other,
    names_to = "category",
    values_to = "percent_income"
  ) |>
  filter(percent_income != 0) |>
  mutate(
    category = toupper(category),  # Capitalize category for legend
    Class = fct_relevel(Class,  
      "$1,000 AND OVER",
      "$750-1,000",
      "$500-750",
      "$400-500",
      "$300-400",
      "$200-300",
      "$100-200"
    ),
    category = fct_relevel(category,  # Reorder spending categories
      "OTHER", "TAX", "CLOTHES", "FOOD", "RENT"
    )
  )
# Add label color
income_long <- income_long |>
  mutate(
    text_color_group = if_else(category == "RENT", "white", "black")
  )
Code
# Inspired by: https://rpubs.com/leeolney/duboischallenge

# Main plot object
income_plot <- ggplot(
  income_long,                                 # Use the cleaned long-format income data
  aes(x = percent_income, y = Class, fill = category)  # Stack percentage by category within each income Class
) +

  # Bar Chart Object
  geom_bar(
    stat = "identity",           # Use actual percent_income values
    position = "fill",          
    width = 0.5,                
    color = "black",             # Outline
    size = 0.2                 
  ) +

  # Percentage centered inside bar labels
  geom_text(
    data = income_long |> 
      filter(!(Class == "$100-200" & category == "TAX")),  # Exclude 0.1% label
    aes(label = paste0(percent_income, "%"), color = text_color_group),  # White or black depending on fill
    position = position_fill(vjust = 0.5),  # Center in block
    size = 3,                               
    fontface = "bold",                     
    show.legend = FALSE                    # Do not show in legend
  ) +

  # Right side y-axis 
  geom_text(
  data = income_long |> distinct(Class, Average_Income),
  aes(x = 1.07, y = Class, label = paste0("$", comma(Average_Income, accuracy = 1))),
  inherit.aes = FALSE,
  hjust = 0,
  size = 4,
  family = "mono",
  color = "black"
) +
  
  # Left side y-axis 
  geom_text(
  data = income_long |> distinct(Class),
  aes(x = -0.1, y = Class, label = Class),
  inherit.aes = FALSE,
  hjust = 1,
  size = 4,
  family = "mono",
  color = "black"
) +

  # Left y-axis Class 
  annotate(
    "text",
    x = -0.1,   # Slightly left of the start of bars
    y = 7.7,      # Just above topmost row (manual adjustment)
    label = "CLASS",
    hjust = 1,    # Left-aligned
    vjust = 1,    # Top-aligned
    size = 4,
    family = "mono",
    color = "black"
  ) +

  # Right y-axis Average income 
  annotate(
    "text",
    x = 1.07,     # Align with income labels
    y = 8,       
    label = "AVERAGE\nINCOME",  
    hjust = 0,
    vjust = 1,
    size = 4,
    family = "mono",
    color = "black"
  ) +

  # Manual color to match example 
  scale_fill_manual(
    name = NULL,
    values = c(
      "OTHER"   = "#cbdfbd",   # Greenish
      "TAX"     = "#8e9aaf",   # Blue-gray
      "CLOTHES" = "#d78879",   # Coral
      "FOOD"    = "#a08294",   # Purple-gray
      "RENT"    = "black"      # Black for rent
    ),
    guide = guide_legend(reverse = TRUE)  # Reverse order to match bar stacking
  ) +

  scale_color_manual(
    values = c("black" = "black", "white" = "white") # Need to define for color text_color_group
) +
  # Scaling
  scale_x_continuous(
    expand = expansion(mult = c(0.25, 0.25))  # Add space left/right of bars
  ) +

theme_void() + # Void and add back what is needed
theme(
  legend.position = "top",    
  legend.box.margin = margin(t = 20),  # Push legend down from top of plot
  plot.title = element_text(
    hjust = 0.5,
    face = "bold"
  )
) +

  labs(
    title = "INCOME AND EXPENDITURE OF 150 NEGRO FAMILIES IN ATLANTA, GA., U.S.A.",
    x = NULL,
    y = NULL
  )

# Plot with parchment object
final_plot <- cowplot::ggdraw(xlim = c(0, 1), ylim = c(0, 1), clip = "off") +
  draw_grob(paper_raster, 0, 0, 2, 2) +
  draw_plot(income_plot)
            

final_plot

This plot was VERY difficult to reproduce.

2 - COVID survey - interpret

Differences by Profession
Nursing students exhibit higher variance in responses to question 1, aligning with my intuition that medical students generally have better access to information and the ability to interpret medical data more accurately. This trend reverses for Questions 4–6, where medical students show both a higher mean Likert score and greater variance in response to statements related to vaccine safety and the approval process. This could indicate that medical students access to clinical knowledge allows for a more informed response. It’s important to note that none of the 90th percentiles exceed a Likert score of 3, suggesting that this disagreement might actually reflect neutrality rather than strong dissent.

Gender Differences
There appears to be no meaningful difference between male and female students across all questions. I found this result surprising given that nursing is a predominantly female profession, while medical students are predominately male. This contradicts the profession based differences observed in my first response. How would the relationship change if a second analysis was done controlling for these factors?

3 - COVID survey - reconstruct

Code
# Load covid-survey data skipping row 1. row 2 will then become column names. 
covid_survey <- read_csv(here("data", "covid-survey.csv" ), skip = 1)
dim(covid_survey)# Dimension
[1] 1121   14
Code
#glimpse(covid_survey) # View columns and type

# Filter and count NA rows
covid_survey_NA <- covid_survey |>
  filter(if_all(-response_id, is.na)) # The - in front of response_id was debugged by GPT
nrow(covid_survey_NA)# Count NA rows
[1] 10
Code
# Drop rows where all are NA
covid_survey_filtered <- covid_survey |>
 filter(!if_all(-response_id, is.na)) # The - in front of response_id was debugged by GPT
dim(covid_survey_filtered) # Matched dim for dropped rows. 
[1] 1111   14
Code
# Factor
covid_survey_factored <- covid_survey_filtered |>
  mutate(
    exp_already_vax = recode_factor(exp_already_vax, `0` = "No", `1` = "Yes"),
    exp_flu_vax     = recode_factor(exp_flu_vax,     `0` = "No", `1` = "Yes"),
# Debugged numeric back tick instead of "" using GPT.
    exp_profession = recode_factor(exp_profession, `0` = "Medical", `1` = "Nursing"),
# Gender
    exp_gender = recode_factor(
      exp_gender,
      `0` = "Male",
      `1` = "Female",
      `3` = "Non-binary",
      `4` = "Prefer not to say"
    ),
# Race 
    exp_race = recode_factor(
      exp_race,
      `1` = "American Indian / Alaskan Native",
      `2` = "Asian",
      `3` = "Black / African American",
      `4` = "Pacific Islander",
      `5` = "White"
    ),
# Hispanic/Non-Hispanic
    exp_ethnicity = recode_factor(
      exp_ethnicity,
      `1` = "Hispanic / Latino",
      `2` = "Non-Hispanic / Non-Latino"
    ),
# Age bin
    exp_age_bin = recode_factor(
      exp_age_bin,
      `0`  = "<20",
      `20` = "21–25",
      `25` = "26–30",
      `30` = ">30"
    )
)
dim(covid_survey_factored)
[1] 1111   14
Code
covid_survey_longer <- covid_survey_factored |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )

# Check
#covid_survey_longer
Code
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low  = quantile(response_value, 0.10, na.rm = TRUE),
    high = quantile(response_value, 0.90, na.rm = TRUE),  
    .groups = "drop" 
    )

# Check result
#covid_survey_summary_stats_by_group
Code
covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),       
    low  = quantile(response_value, 0.10, na.rm = TRUE),  
    high = quantile(response_value, 0.90, na.rm = TRUE),
    explanatory = "All",            # Add fixed value directly
    explanatory_value = factor("") ,# Placeholder
    #.groups = "drop"
  ) |>
  select(response, mean, low, high, explanatory, explanatory_value)  # Column order

#Check
#covid_survey_summary_stats_all
Code
# Combine grouped and overall summary stats into tibble
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_by_group,
  covid_survey_summary_stats_all
) |> 
  select(response, mean, low, high, explanatory, explanatory_value) # Order
Code
# Relabel 
covid_survey_summary_stats <- covid_survey_summary_stats |>
  mutate(
    response = recode(response,
      "resp_safety"              = "Based on my understanding, I believe the vaccine is safe",
      "resp_confidence_science"  = "I am confident in the scientific vetting process for the new COVID vaccines",
      "resp_concern_safety"      = "I am concerned about the safety and side effects of the vaccine",
      "resp_feel_safe_at_work"   = "Getting the vaccine will make me feel safer at work",
      "resp_will_recommend"      = "I will recommend the vaccine to family, friends, and community members",
      "resp_trust_info"          = "I trust the information that I have received about the vaccines",
    ),

    explanatory = recode(explanatory,
      "exp_age_bin"         = "Age",               
      "exp_gender"          = "Gender",
      "exp_race"            = "Race",
      "exp_ethnicity"       = "Ethnicity",
      "exp_profession"      = "Profession",
      "exp_already_vax"     = "Had COVID Vaccine",
      "exp_flu_vax"         = "Had flu vaccine this year",
    )
  )

covid_survey_summary_stats <- covid_survey_summary_stats |>
  mutate(
    explanatory = fct_relevel(
      explanatory,
      "All",
      "Age",
      "Gender",
      "Race",
      "Ethnicity",
      "Profession",
      "Had COVID Vaccine",
      "Had flu vaccine this year",
    )
  )
Code
ggplot(
  covid_survey_summary_stats,
  aes(x = mean, y = explanatory_value)
) +
  geom_point(size = 1) +
  geom_errorbarh(
    aes(xmin = low, xmax = high), 
    height = 0.2,
    linewidth = 0.4
  ) +
  facet_grid(
    rows = vars(explanatory),
    cols = vars(response),
    scales = "free_y", # I could not get these to render properly.
    space = "free_y",  # Found the solution in lecture slides.
    labeller = labeller(
      explanatory = label_wrap_gen(15),
      response = label_wrap_gen(15)
    )
  ) +
    theme_void() +
theme(
  axis.text.x = element_text(size = 9),
  axis.title.x = element_text(size = 10),
  axis.text.y = element_text(size = 9, hjust = 1),
  strip.background = element_rect(fill = "gray90", color = "grey30"),
    ) +
  labs(
    x = "Mean Likert score\n(Error bars show 10th–90th percentile)",
  )

4 - COVID survey - re-reconstruct

Code
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low  = quantile(response_value, 0.25, na.rm = TRUE),
    high = quantile(response_value, 0.75, na.rm = TRUE),  
    .groups = "drop" 
  )

# Combine new grouped and overall summary stats 
re_covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_by_group,
  covid_survey_summary_stats_all
) |> 
  select(response, mean, low, high, explanatory, explanatory_value) 

# Relabel 
re_covid_survey_summary_stats <- re_covid_survey_summary_stats |>
  mutate(
    response = recode(response,
      "resp_safety"              = "Based on my understanding, I believe the vaccine is safe",
      "resp_confidence_science"  = "I am confident in the scientific vetting process for the new COVID vaccines",
      "resp_concern_safety"      = "I am concerned about the safety and side effects of the vaccine",
      "resp_feel_safe_at_work"   = "Getting the vaccine will make me feel safer at work",
      "resp_will_recommend"      = "I will recommend the vaccine to family, friends, and community members",
      "resp_trust_info"          = "I trust the information that I have received about the vaccines",
    ),

    explanatory = recode(explanatory,
      "exp_age_bin"         = "Age",               
      "exp_gender"          = "Gender",
      "exp_race"            = "Race",
      "exp_ethnicity"       = "Ethnicity",
      "exp_profession"      = "Profession",
      "exp_already_vax"     = "Had COVID Vaccine",
      "exp_flu_vax"         = "Had flu vaccine this year",
    )
  )

re_covid_survey_summary_stats <- re_covid_survey_summary_stats |>
  mutate(
    explanatory = fct_relevel(
      explanatory,
      "All",
      "Age",
      "Gender",
      "Race",
      "Ethnicity",
      "Profession",
      "Had COVID Vaccine",
      "Had flu vaccine this year",
    )
  )

ggplot(
  re_covid_survey_summary_stats,
  aes(x = mean, y = explanatory_value)
) +
  geom_point(size = 1) +
  geom_errorbarh(
    aes(xmin = low, xmax = high), 
    height = 0.2,
    linewidth = 0.4
  ) +
  facet_grid(
    rows = vars(explanatory),
    cols = vars(response),
    scales = "free_y", # I could not get these to render properly.
    space = "free_y",  # Found the solution in lecture slides.
    labeller = labeller(
      explanatory = label_wrap_gen(15),
      response = label_wrap_gen(15)
    )
  ) +
    theme_void() +
theme(
  axis.text.x = element_text(size = 9),
  axis.title.x = element_text(size = 10),
  axis.text.y = element_text(size = 9, hjust = 1),
  strip.background = element_rect(fill = "gray90", color = "grey30"),
    ) +
  labs(
    x = "Mean Likert score\n(Error bars show 25th–75th percentile)",
  )

Changing Quartile

Switching to the 25th–75th percentile unsurprisingly compresses the visual range of responses. What first looked like strong disagreement (higher variance) now appears as mild uncertainty or even neutrality. The central narrative holds across both versions of the analysis plot. The underlying relationship between distributions across profession and gender doesn’t change. If anything, nuance gets lost when using wider percentiles, so I’d argue the 25th–75th percentile is a more accurate representation in this case.

5 - COVID survey - another view

Code
# source: https://larmarange.github.io/ggstats/articles/gglikert.html

# Recode Likert responses and compute percentage
covid_likert_plot_data <- covid_survey_longer |>
  filter(!is.na(response_value)) |>
  mutate(
    response_value = factor(response_value, levels = c("1", "2", "3", "4", "5")),
    response = recode(response,
      "resp_safety"             = "The vaccine is safe",
      "resp_confidence_science" = "Trust the science",
      "resp_concern_safety"     = "Concerned about safety",
      "resp_feel_safe_at_work"  = "Feel safe at work",
      "resp_will_recommend"     = "Will recommend vaccine",
      "resp_trust_info"         = "Trust received information"
    ),
    response = fct_relevel(response,  
      "Concerned about safety",
      "The vaccine is safe",
      "Trust the science",
      "Trust received information",
      "Feel safe at work",
      "Will recommend vaccine"
    )
  ) |> # Used GPT to debug this logic. 
  group_by(response, response_value) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(response) |>
  mutate(percent = n / sum(n)) |>
  ungroup()

# Wide format for gglikert()
covid_survey_wide <- covid_survey_longer |>
  mutate(
    response_value = factor(
      as.character(response_value),
      levels = c("1", "2", "3", "4", "5")
    ),
    response = recode(response,
      "resp_safety"             = "The vaccine is safe",
      "resp_confidence_science" = "Trust the science",
      "resp_concern_safety"     = "Concerned about safety",
      "resp_feel_safe_at_work"  = "Feel safe at work",
      "resp_will_recommend"     = "Will recommend vaccine",
      "resp_trust_info"         = "Trust received information"
    ),
    response = fct_relevel(response,  
      "Concerned about safety",
      "The vaccine is safe",
      "Trust the science",
      "Trust received information",
      "Feel safe at work",
      "Will recommend vaccine"
    ),
    response = fct_rev(response) # Fix stacking downstream
  ) |>
  group_by(response_id, response) |>
  slice(1) |>  # Used GPT to debug this pipeline
  ungroup() |>
  select(response_id, response, response_value) |>
  pivot_wider(
    names_from = response,
    values_from = response_value
  )
Code
gglikert(
  data = select(covid_survey_wide, -response_id),
  add_labels = FALSE,          # Remove percentage labels
  #reverse_likert = FALSE,      # Reverse Stack
  add_totals = FALSE            # Remove Total labels
) +
  scale_fill_manual(
    values = c(
      "1" = "#FFFFCC",
      "2" = "#A1DAB4",
      "3" = "#41B6C4",
      "4" = "#2C7FB8",
      "5" = "#253494"
    ),
    labels = c(
      "1" = "1 – Strongly Agree",
      "2" = "2 – Somewhat Agree",
      "3" = "3 – Neutral",
      "4" = "4 – Somewhat Disagree",
      "5" = "5 – Strongly Disagree"
    ),
    name = NULL
  ) +
  labs(
    title = "COVID Vaccine Survey Response",
    subtitle = "Diverging Bar Chart Likert Scale",
    x = NULL, 
    y = NULL
  ) +
  theme_minimal(base_size = 14 # Not sure why it didn't use global settings
  ) + 
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

5a A diverging stacked bar chart showing responses to six COVID-19 vaccine statements on a Likert scale. Each horizontal bar is divided into five segments representing responses from “1 – Strongly Agree” to “5 – Strongly Disagree”. The color palette chosen is an accessibility friendly light yellow to dark blue. Agreement responses extend right from center and disagreement responses extend left. The Data shows strong sentiment for trust and safety. The last response “concerned about safety” reinforces the conclusion drawn by the other questions.

Code
ggplot(covid_likert_plot_data, aes(
  x = percent,
  y = response,
  fill = response_value
)) +
  geom_col(
    width = 0.7,
    position = position_fill(reverse = TRUE)  # Reverse stack direction to match legend 1-5
  ) +
  scale_x_continuous(
    labels = scales::percent_format()
  ) +
scale_fill_manual(
    values = c(
      "1" = "#FFFFCC",
      "2" = "#A1DAB4",
      "3" = "#41B6C4",
      "4" = "#2C7FB8",
      "5" = "#253494"
    ),
    labels = c(
      "1" = "1 – Strongly Agree",
      "2" = "2 – Somewhat Agree",
      "3" = "3 – Neutral",
      "4" = "4 – Somewhat Disagree",
      "5" = "5 – Strongly Disagree"
    ),
    name = NULL
  ) +
  labs(
    title = "Covid Vaccine Survey Response",
    subtitle = " 100% Bar Chart Likert Scale",
    x = NULL,
    y = NULL
  ) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

5b 100% Bar Chart showing responses to six COVID-19 vaccine statements on a Likert scale. Each horizontal bar is divided into five segments representing responses from “1 – Strongly Agree” to “5 – Strongly Disagree”. The color palette chosen is an accessibility friendly light yellow to dark blue. Agreement responses extend right from center and disagreement responses extend left. The Chart shows the larger variance in vaccine safety concerns while all other responses seem to have a concentration of responses in 1-2 range.

5c Both charts visualize the same Likert data. The diverging bar chart emphasizes the direction of responses by centering the neutral response, making it easier to detect skew toward agreement or disagreement. The 100% stacked bar chart standardizes each row’s total, allowing clearer comparison of proportional distributions across all response levels. The diverging chart highlights polarity and contentiousness, while the 100% Bar chart better supports detailed breakdowns.